import pandas as pd
import numpy as np
import pandas_profiling
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from scipy import stats
# use seaborn plotting defaults
import seaborn as sns; sns.set()
%matplotlib inline
# Load the breast-cancer-style SVM dataset; columns are 30 numeric features
# plus a binary 'target' column (see svm_features / svm_target below).
svm_data = pd.read_csv("data/svm.csv")
# this crashed every time I ran it
# but it worked long enough to see that all values are numeric,
# there are no missing values,
# and that the data needs to be normalized
# svm_data.profile_report()
# All column labels, including the target (used for the correlation-plot axes).
svm_columns = svm_data.columns
# Feature columns only -- everything except 'target'.
svm_features = ['mean radius', 'mean texture', 'mean perimeter', 'mean area',
'mean smoothness', 'mean compactness', 'mean concavity',
'mean concave points', 'mean symmetry', 'mean fractal dimension',
'radius error', 'texture error', 'perimeter error', 'area error',
'smoothness error', 'compactness error', 'concavity error',
'concave points error', 'symmetry error', 'fractal dimension error',
'worst radius', 'worst texture', 'worst perimeter', 'worst area',
'worst smoothness', 'worst compactness', 'worst concavity',
'worst concave points', 'worst symmetry', 'worst fractal dimension']
svm_target = 'target'
# Min-max scale every column (including 'target') into [0, 1].
# NOTE(review): this assumes the target is already 0/1, so scaling leaves it
# unchanged -- confirm against the data file.
min_max_scaler = preprocessing.MinMaxScaler()
svm_data_clean = pd.DataFrame(data=min_max_scaler.fit_transform(svm_data.values), columns=svm_data.columns)
# Correlation-matrix heat-map of the raw data (Pearson correlation is
# scale-invariant, so the un-normalized frame is fine here).
plt.rcParams["figure.figsize"] = 20, 20
# BUG FIX: the original created plt.figure(0, ...) but plt.matshow() opens its
# *own* new figure by default, and then a further empty 10x10 figure was
# created and shown (the stray "<Figure size 1000x1000 with 0 Axes>" output).
# Draw everything into the single figure matshow creates.
plt.matshow(svm_data.corr())
plt.xticks(np.arange(len(svm_columns)), svm_columns, rotation=90)
plt.yticks(np.arange(len(svm_columns)), svm_columns)
plt.show()
plt.close()
<Figure size 1000x1000 with 0 Axes>
# One scatter plot per unordered pair of features, colored by class label.
for i, feat_x in enumerate(svm_features):
    for j, feat_y in enumerate(svm_features):
        if j <= i:
            continue  # each pair is plotted exactly once (i < j)
        plt.scatter(
            svm_data[feat_x].values,
            svm_data[feat_y].values,
            cmap="bwr",
            c=svm_data[svm_target].values,
        )
        plt.title(f"{feat_x} vs {feat_y}")
        plt.xlabel(feat_x)
        plt.ylabel(feat_y)
        plt.show()
def score_result(description, classifier, x, y):
    """Print accuracy, confusion matrix and classification report for a
    fitted binary classifier, then draw the confusion matrix as an annotated
    seaborn heat-map.

    Relies on the module-level imports: confusion_matrix,
    classification_report, np, sns, plt.
    """
    print(" " * 15, description)
    print("Accuracy Score: ", classifier.score(x, y))
    predictions = classifier.predict(x)
    matrix = confusion_matrix(y, predictions)
    print(matrix)
    print(classification_report(y, predictions))
    # Annotate each of the four cells with its name, raw count, and share of
    # all samples (cell order follows sklearn's [[TN, FP], [FN, TP]] layout).
    names = ["True Neg", "False Pos", "False Neg", "True Pos"]
    counts = ["{0:0.0f}".format(c) for c in matrix.flatten()]
    shares = ["{0:.2%}".format(s) for s in matrix.flatten() / np.sum(matrix)]
    cell_labels = np.asarray(
        [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(names, counts, shares)]
    ).reshape(2, 2)
    sns.heatmap(matrix, annot=cell_labels, fmt="", cmap='Blues', cbar=False)
    plt.show()
# NOTE(review): this split is never used below -- each kernel section
# re-creates an identical split (same random_state=0) under its own names.
svm_train_x, svm_test_x, svm_train_y, svm_test_y = train_test_split(svm_data_clean[svm_features], svm_data_clean[svm_target], train_size=0.3, random_state=0)
from sklearn.svm import LinearSVC
# train_size=0.3: only 30% of the rows train the model, 70% are held out.
linear_train_x, linear_test_x, linear_train_y, linear_test_y = train_test_split(svm_data_clean[svm_features], svm_data_clean[svm_target], train_size=0.3, random_state=0)
# Linear SVM with default hyperparameters.
lsvc = LinearSVC()
lsvc.fit(linear_train_x, linear_train_y)
# Report metrics on both the training and the held-out split.
score_result("Linear Training Set", lsvc, linear_train_x, linear_train_y)
print("~"*60)
score_result("Linear Test Set", lsvc, linear_test_x, linear_test_y)
Linear Training Set
Accuracy Score: 1.0
[[ 66 0]
[ 0 104]]
precision recall f1-score support
0.0 1.00 1.00 1.00 66
1.0 1.00 1.00 1.00 104
accuracy 1.00 170
macro avg 1.00 1.00 1.00 170
weighted avg 1.00 1.00 1.00 170
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Linear Test Set
Accuracy Score: 0.9699248120300752
[[136 10]
[ 2 251]]
precision recall f1-score support
0.0 0.99 0.93 0.96 146
1.0 0.96 0.99 0.98 253
accuracy 0.97 399
macro avg 0.97 0.96 0.97 399
weighted avg 0.97 0.97 0.97 399
from sklearn.svm import SVC
# Same deterministic 30/70 split as the linear section (random_state=0).
poly_train_x, poly_test_x, poly_train_y, poly_test_y = train_test_split(svm_data_clean[svm_features], svm_data_clean[svm_target], train_size=0.3, random_state=0)
# Degree-2 polynomial kernel; other hyperparameters left at sklearn defaults.
psvc = SVC(kernel="poly", degree=2)
psvc.fit(poly_train_x, poly_train_y)
score_result("Polynomial Training Set", psvc, poly_train_x, poly_train_y)
print("~"*60)
score_result("Polynomial Test Set", psvc, poly_test_x, poly_test_y)
Polynomial Training Set
Accuracy Score: 1.0
[[ 66 0]
[ 0 104]]
precision recall f1-score support
0.0 1.00 1.00 1.00 66
1.0 1.00 1.00 1.00 104
accuracy 1.00 170
macro avg 1.00 1.00 1.00 170
weighted avg 1.00 1.00 1.00 170
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Polynomial Test Set
Accuracy Score: 0.9624060150375939
[[136 10]
[ 5 248]]
precision recall f1-score support
0.0 0.96 0.93 0.95 146
1.0 0.96 0.98 0.97 253
accuracy 0.96 399
macro avg 0.96 0.96 0.96 399
weighted avg 0.96 0.96 0.96 399
from sklearn.svm import SVC
# Same deterministic 30/70 split as the other kernel sections (random_state=0).
rbf_train_x, rbf_test_x, rbf_train_y, rbf_test_y = train_test_split(svm_data_clean[svm_features], svm_data_clean[svm_target], train_size=0.3, random_state=0)
# RBF kernel; C=0.1 applies stronger regularization than the default C=1.0.
rbfsvc = SVC(kernel="rbf", gamma=0.5, C=0.1)
# BUG FIX: the original fit on poly_train_x/poly_train_y, copy-pasted from the
# polynomial section. The splits happen to be identical (same random_state),
# so results are unchanged, but fit on this section's own split.
rbfsvc.fit(rbf_train_x, rbf_train_y)
score_result("RBF Training Set", rbfsvc, rbf_train_x, rbf_train_y)
print("~"*60)
score_result("RBF Test Set", rbfsvc, rbf_test_x, rbf_test_y)
RBF Training Set
Accuracy Score: 0.9352941176470588
[[ 55 11]
[ 0 104]]
precision recall f1-score support
0.0 1.00 0.83 0.91 66
1.0 0.90 1.00 0.95 104
accuracy 0.94 170
macro avg 0.95 0.92 0.93 170
weighted avg 0.94 0.94 0.93 170
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
RBF Test Set
Accuracy Score: 0.9398496240601504
[[124 22]
[ 2 251]]
precision recall f1-score support
0.0 0.98 0.85 0.91 146
1.0 0.92 0.99 0.95 253
accuracy 0.94 399
macro avg 0.95 0.92 0.93 399
weighted avg 0.94 0.94 0.94 399
All three kernels performed well on this dataset, though RBF was clearly the worst.
Linear and Polynomial were very similar, but Polynomial produced more false negatives on the test set (5 vs. 2).
This suggests the classes are close to linearly separable, and the more flexible RBF kernel is overcomplicating the problem rather than helping.
# Load the decision-tree dataset: six categorical feature columns plus a
# multi-class 'target_class' column.
tree_data = pd.read_csv("data/decisionTree.csv")
# NOTE(review): profile_report() was reported to crash in the SVM section
# above -- confirm it runs cleanly on this dataset.
tree_data.profile_report()
from sklearn.tree import plot_tree
from sklearn.tree import DecisionTreeClassifier
le = preprocessing.LabelEncoder()
tree_features = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']
tree_target = 'target_class'
# Original (un-encoded) class labels, used later to label the plotted tree.
tree_target_names = tree_data[tree_target].unique()
# DataFrame.apply calls le.fit_transform once per column, so each column gets
# its own integer encoding; `le` itself retains only the last column's fit.
tree_data_encoded = tree_data.apply(le.fit_transform)
def create_decision_tree(description, classifier, x, y):
    """Print a classification report for an already-fitted decision tree and
    render the tree itself.

    Despite the name, nothing is trained here -- `classifier` must be fitted.
    Plot labels come from the module-level globals `tree_features` and
    `tree_target_names`; also relies on classification_report, plot_tree, plt.
    """
    print(" " * 20, description)
    # zero_division=0 silences warnings for classes the tree never predicts.
    print(classification_report(y, classifier.predict(x), zero_division=0))
    plt.figure(figsize=(25, 20))
    plot_tree(classifier, feature_names=tree_features,
              class_names=tree_target_names, filled=True)
    plt.show()
# 30/70 train/test split of the label-encoded data (deterministic via seed).
gini_train_x, gini_test_x, gini_train_y, gini_test_y = train_test_split(tree_data_encoded[tree_features], tree_data_encoded[tree_target], train_size=0.3, random_state=0)
# Depth-limited tree using Gini impurity; random_state fixes tie-breaking.
gini_classifier = DecisionTreeClassifier(criterion="gini", max_depth=3, random_state=0)
gini_classifier.fit(gini_train_x, gini_train_y)
create_decision_tree("Gini Train", gini_classifier, gini_train_x, gini_train_y)
print("~"*60)
create_decision_tree("Gini Test", gini_classifier, gini_test_x, gini_test_y)
Gini Train
precision recall f1-score support
0 0.59 0.52 0.55 113
1 0.00 0.00 0.00 20
2 0.84 0.94 0.89 374
3 0.00 0.00 0.00 11
accuracy 0.80 518
macro avg 0.36 0.37 0.36 518
weighted avg 0.74 0.80 0.76 518
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Gini Test
precision recall f1-score support
0 0.51 0.54 0.52 271
1 0.00 0.00 0.00 49
2 0.83 0.92 0.88 835
3 0.00 0.00 0.00 54
accuracy 0.76 1209
macro avg 0.34 0.36 0.35 1209
weighted avg 0.69 0.76 0.72 1209
# Same split as the gini section (identical random_state), but trained with
# the entropy (information-gain) criterion instead of Gini impurity.
ent_train_x, ent_test_x, ent_train_y, ent_test_y = train_test_split(tree_data_encoded[tree_features], tree_data_encoded[tree_target], train_size=0.3, random_state=0)
ent_classifier = DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=0)
ent_classifier.fit(ent_train_x, ent_train_y)
# BUG FIX: the original evaluated on gini_train_x/gini_test_x (copy-paste from
# the gini section). The values are identical (same seed), so the printed
# numbers don't change, but evaluate on this section's own variables.
create_decision_tree("Entropy Train", ent_classifier, ent_train_x, ent_train_y)
print("~"*60)
create_decision_tree("Entropy Test", ent_classifier, ent_test_x, ent_test_y)
Entropy Train
precision recall f1-score support
0 0.59 0.52 0.55 113
1 0.00 0.00 0.00 20
2 0.84 0.94 0.89 374
3 0.00 0.00 0.00 11
accuracy 0.80 518
macro avg 0.36 0.37 0.36 518
weighted avg 0.74 0.80 0.76 518
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Entropy Test
precision recall f1-score support
0 0.51 0.54 0.52 271
1 0.00 0.00 0.00 49
2 0.83 0.92 0.88 835
3 0.00 0.00 0.00 54
accuracy 0.76 1209
macro avg 0.34 0.36 0.35 1209
weighted avg 0.69 0.76 0.72 1209